### Tools for Handling Hexadecimals as unique characters 
# allows proper BPE tokenizer
# since we don't want '15 200' to be tokenized as ['1', '5 2','00']
# now we get that '15 200' is 'Fđ' which can be tokenized as ['F', 'đ'] or ['Fđ']
## I removed all control, whitespace, and punctuation characters which could cause issues due to post-processing by HuggingFace tokenizers

# Using standard ASCII printable characters where possible, and extending with Unicode symbols
# Mapping from token value (0-258) to a single unique printable character.
# 0-61 use ASCII alphanumerics; the rest use Latin Extended letters, chosen to
# avoid control, whitespace, and punctuation characters that HuggingFace
# tokenizers post-process.  Keys 256-258 are reserved for the pad/bos/eos
# special tokens defined below.
decimal_to_char = {
    0: '0',   1: '1',   2: '2',   3: '3',   4: '4',   5: '5',   6: '6',   7: '7',
    8: '8',   9: '9',   10: 'A',  11: 'B',  12: 'C',  13: 'D',  14: 'E',  15: 'F',
    16: 'G',  17: 'H',  18: 'I',  19: 'J',  20: 'K',  21: 'L',  22: 'M',  23: 'N',
    24: 'O',  25: 'P',  26: 'Q',  27: 'R',  28: 'S',  29: 'T',  30: 'U',  31: 'V',
    32: 'W',  33: 'X',  34: 'Y',  35: 'Z',  36: 'a',  37: 'b',  38: 'c',  39: 'd',
    40: 'e',  41: 'f',  42: 'g',  43: 'h',  44: 'i',  45: 'j',  46: 'k',  47: 'l',
    48: 'm',  49: 'n',  50: 'o',  51: 'p',  52: 'q',  53: 'r',  54: 's',  55: 't',
    56: 'u',  57: 'v',  58: 'w',  59: 'x',  60: 'y',  61: 'z',  62: 'Ŏ',  63: 'ŏ',
    64: 'ő',  65: 'Ǵ',  66: 'ǵ',  67: 'Ƕ',  68: 'ƕ',  69: 'Ǻ',  70: 'ǻ',  71: 'Ǽ',
    72: 'ǽ',  73: 'Ǿ',  74: 'ǿ',  75: 'Ȁ',  76: 'ȁ',  77: 'Ȃ',  78: 'ȃ',  79: 'Ȅ',
    80: 'ȅ',  81: 'Ȇ',  82: 'ȇ',  83: 'Ȉ',  84: 'ȉ',  85: 'Ȋ',  86: 'ȋ',  87: 'Ȍ',
    88: 'ȍ',  89: 'Ȏ',  90: 'ȏ',  91: 'Ȑ',  92: 'ȑ',  93: 'Ȓ',  94: 'ȓ',  95: 'Ȕ',
    96: 'ȕ',  97: 'Ȗ',  98: 'ȗ',  99: 'Ș', 100: 'ș', 101: 'Ț', 102: 'ț', 103: 'Ŵ',
    104: 'ŵ', 105: 'Ŷ', 106: 'ŷ', 107: 'Ÿ', 108: 'Ź', 109: 'ź', 110: 'Ż', 111: 'ż',
    112: 'Ž', 113: 'ž', 114: 'ſ', 115: 'ƀ', 116: 'Ɓ', 117: 'Ƃ', 118: 'ƃ', 119: 'À',
    120: 'Á', 121: 'Â', 122: 'Ã', 123: 'Ä', 124: 'Å', 125: 'Æ', 126: 'Ç', 127: 'È',
    128: 'É', 129: 'Ê', 130: 'Ë', 131: 'Ì', 132: 'Í', 133: 'Î', 134: 'Ï', 135: 'Ð',
    136: 'Ñ', 137: 'Ò', 138: 'Ó', 139: 'Ô', 140: 'Õ', 141: 'Ö', 142: 'Ɖ', 143: 'Ø',
    144: 'Ù', 145: 'Ú', 146: 'Û', 147: 'Ü', 148: 'Ý', 149: 'Þ', 150: 'ß', 151: 'à',
    152: 'á', 153: 'â', 154: 'ã', 155: 'ä', 156: 'å', 157: 'æ', 158: 'ç', 159: 'è',
    160: 'é', 161: 'ê', 162: 'ë', 163: 'ì', 164: 'í', 165: 'î', 166: 'ï', 167: 'ð',
    168: 'ñ', 169: 'ò', 170: 'ó', 171: 'ô', 172: 'õ', 173: 'ö', 174: 'Ɗ', 175: 'ø',
    176: 'ù', 177: 'ú', 178: 'û', 179: 'ü', 180: 'ý', 181: 'þ', 182: 'ÿ', 183: 'Ā',
    184: 'ā', 185: 'Ă', 186: 'ă', 187: 'Ą', 188: 'ą', 189: 'Ć', 190: 'ć', 191: 'Ĉ',
    192: 'ĉ', 193: 'Ƅ', 194: 'ċ', 195: 'Č', 196: 'č', 197: 'Ď', 198: 'ď', 199: 'Đ',
    200: 'đ', 201: 'Ē', 202: 'ē', 203: 'Ĕ', 204: 'ĕ', 205: 'Ė', 206: 'ė', 207: 'Ę',
    208: 'ę', 209: 'Ě', 210: 'ě', 211: 'Ĝ', 212: 'ĝ', 213: 'Ğ', 214: 'ğ', 215: 'ƅ',
    216: 'ġ', 217: 'Ģ', 218: 'ģ', 219: 'Ĥ', 220: 'ĥ', 221: 'Ħ', 222: 'ħ', 223: 'Ĩ',
    224: 'ĩ', 225: 'Ī', 226: 'ī', 227: 'Ĭ', 228: 'ĭ', 229: 'Į', 230: 'į', 231: 'İ',
    232: 'ı', 233: 'Ĳ', 234: 'ĳ', 235: 'Ĵ', 236: 'ĵ', 237: 'Ķ', 238: 'ķ', 239: 'ĸ',
    240: 'Ĺ', 241: 'ĺ', 242: 'Ļ', 243: 'ļ', 244: 'Ľ', 245: 'ľ', 246: 'Ŀ', 247: 'ŀ',
    248: 'Ł', 249: 'ł', 250: 'Ń', 251: 'ń', 252: 'Ņ', 253: 'ņ', 254: 'Ň', 255: 'ň',
    256: 'ŉ', 257: 'Ō', 258: 'ō'
}
# keep this one separate: Ŧ (we will use this as token prefix to identify BPE byte tokens, it's a cool viking alphabet symbol I think)

# Sanity check: every character must be unique, otherwise the inversion below
# would silently drop entries.  (Replaces the previous commented-out check,
# which depended on numpy without importing it and did not actually test value
# uniqueness: np.unique over dict *items* compares (key, value) pairs.)
if len(set(decimal_to_char.values())) != len(decimal_to_char):
  raise ValueError("decimal_to_char values are not unique; inversion would lose entries")
char_to_decimal = {v: k for k, v in decimal_to_char.items()} # invert dictionary

def hex_to_char(x, offset=''):
  """Encode a token value, or a list of them, as its character string.

  Args:
    x: an int key of decimal_to_char (0-258), or a list of such ints.
    offset: optional string prepended to every encoded character.

  Returns:
    The encoded string: one character per input value, each prefixed by
    ``offset`` (empty by default).

  Raises:
    KeyError: if a value is not a key of decimal_to_char.
  """
  if isinstance(x, list):
    return ''.join(offset + decimal_to_char[v] for v in x)
  # Fix: the scalar path previously ignored `offset`, inconsistent with the
  # list path above.  The default offset='' keeps existing calls unchanged.
  return offset + decimal_to_char[x]

def char_to_hex(x):
  """Decode a character-encoded string back to its token value(s).

  Inverse of hex_to_char (with no offset).

  Args:
    x: a string whose characters are values of decimal_to_char.

  Returns:
    A single int for a 1-character string, a list of ints for longer
    strings, and [] for the empty string (previously raised KeyError('')).

  Raises:
    KeyError: if any character is not in char_to_decimal.
  """
  if len(x) == 1:
    return char_to_decimal[x]
  # Handles both the multi-character case and the empty string (-> []).
  return [char_to_decimal[c] for c in x]

# Special tokens: the three reserved ids (256-258) above the 256 byte values.
pad, bos, eos = (hex_to_char(i) for i in (256, 257, 258))  # pad is currently unused

